import sentencepiece


# Path to the trained SentencePiece model file.
MODEL_PATH = "lulu/tokenizer.model"


def main(model_path: str = MODEL_PATH) -> None:
    """Load a SentencePiece tokenizer and demo an encode/decode round trip.

    Loads the model at *model_path*, tokenizes a sample sentence into both
    subword pieces and ids, prints them, then decodes each representation
    back to text.
    """
    sp = sentencepiece.SentencePieceProcessor()
    sp.load(model_path)

    # To dump the entire vocabulary instead, iterate the piece table:
    #   for i in range(sp.get_piece_size()):
    #       print(f"Index: {i}, Piece: {sp.id_to_piece(i)}")

    text = "好好学习，天天向上。"
    # Example outputs for one particular model (exact pieces/ids depend on
    # the trained model, e.g. ['▁', '好好学习', ',', '天天', '向上', '。']).
    pieces = sp.encode_as_pieces(text)
    ids = sp.encode_as_ids(text)
    print(pieces)
    print(ids)

    # Decoding either form recovers the text, modulo the model's text
    # normalization (e.g. full-width '，' may come back as ',').
    print(sp.decode_pieces(pieces))
    print(sp.decode_ids(ids))


if __name__ == "__main__":
    main()